Flight Fare Analysis
Group G
- Dev Makwana (885064)
- Krina Patel (886861)
- Mahaveersinh Chauhan (884854)
- Trushna Patel (886910)
The goal of this project is to analyze airline data and predict flight fares from parameters such as Airline, Source, Destination, Route, Stops, and so on.
| COLUMN | DESCRIPTION |
|---|---|
| Airline | The airline's name |
| Date_of_Journey | The date of the journey |
| Source | The source from which the service begins |
| Destination | The destination where the service ends |
| Route | The route taken by the flight to reach the destination |
| Dep_Time | The time when the journey starts from the source |
| Arrival_Time | Time of arrival at the destination |
| Duration | Total duration of the flight |
| Total_Stops | Total stops between the source and destination |
| Additional_Info | Additional information about the flight |
| Price | The price of the ticket |
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.model_selection import KFold,cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.filterwarnings('ignore')
def check_null_values(data):
    """Count the missing entries in every column

    Args:
        data (DataFrame): DataFrame Object to inspect

    Returns:
        Series : Number of null values per column, indexed by column name
    """
    null_mask = data.isna()
    return null_mask.sum()
def remove_null_values(data):
    """Drop every row that contains at least one missing value

    Args:
        data (DataFrame): DataFrame Object

    Returns:
        DataFrame : New DataFrame without the incomplete rows, re-indexed from 0
    """
    complete_rows = data.dropna()
    return complete_rows.reset_index(drop=True)
def check_duplicate(data):
    """Count the rows that repeat an earlier row of the dataframe

    Args:
        data (DataFrame): DataFrame Object

    Returns:
        Int: Count of Duplicate Rows (the first occurrence is not counted)
    """
    duplicate_mask = data.duplicated()
    return int(duplicate_mask.sum())
def remove_duplicate(df):
    """Remove repeated rows from the DataFrame

    Args:
        df (DataFrame): DataFrame Object

    Returns:
        DataFrame: New DataFrame keeping the first occurrence of each row, re-indexed from 0
    """
    unique_rows = df.drop_duplicates()
    return unique_rows.reset_index(drop=True)
def check_unique_values(data):
    """Print the number of distinct values found in each column

    Args:
        data (DataFrame): DataFrame Object

    Returns:
        None: None (one "<column> - <count>" line is printed per column)
    """
    for name in data.columns:
        distinct_count = len(data[name].unique())
        print(name, "-", distinct_count)
def base_models(X, y, test_size=0.2, random_state=42):
    """Fit a set of baseline regressors and collect train/test metrics

    Args:
        X (DataFrame): Feature DataFrame
        y (DataFrame): Target DataFrame
        test_size (float, optional): Test Size. Defaults to 0.2.
        random_state (int, optional): Random State used for the split and for
            every model that accepts a seed. Defaults to 42.

    Returns:
        dict: Maps each model name to a dict with the keys 'MSE Test',
            'MAE Test', 'MSE Train', 'MAE Train', 'R2 test' and 'R2 train'
    """
    # Seed every estimator that takes a random_state so repeated runs agree
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree Regression': DecisionTreeRegressor(random_state=random_state),
        'Random Forest Regression': RandomForestRegressor(random_state=random_state),
        'KNN Regression': KNeighborsRegressor(),
        'AdaBoost Regression': AdaBoostRegressor(random_state=random_state),
        'XGBoost Regression': XGBRegressor(random_state=random_state),
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    evaluation_results = {}
    for model_name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)

        # Make predictions on both splits so over-fitting is visible
        y_pred = model.predict(X_test)
        y_pred_train = model.predict(X_train)

        # Store evaluation results in a dictionary
        evaluation_results[model_name] = {
            'MSE Test': mean_squared_error(y_test, y_pred),
            'MAE Test': mean_absolute_error(y_test, y_pred),
            'MSE Train': mean_squared_error(y_train, y_pred_train),
            'MAE Train': mean_absolute_error(y_train, y_pred_train),
            'R2 test': r2_score(y_test, y_pred),
            'R2 train': r2_score(y_train, y_pred_train),
        }
    return evaluation_results
def hyperparameter_tuning_XGBoost(X, y, test_size=0.2, random_state=42):
    """Hyper Parameter Tuning of XGBoost

    Runs a randomized search over XGBRegressor settings on the training split
    and prints the best parameters, the best cross-validation score and the
    search object's score on the held-out test split.

    Args:
        X (DataFrame): Feature DataFrame
        y (DataFrame): Target DataFrame
        test_size (float, optional): Test Size. Defaults to 0.2.
        random_state (int, optional): Random State used for the split, the
            estimator and the parameter sampling. Defaults to 42.

    Returns:
        None: None
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    param_grid = {
        'n_estimators': [100, 200, 300, 400, 500, 600, 800],
        'max_depth': [3, 5, 10, 20],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
        'gamma': [0, 1, 2, 3],
        'reg_alpha': [0, 0.1, 0.5, 1],
        'reg_lambda': [0, 0.1, 0.5, 1]
    }
    XG = XGBRegressor(random_state=random_state, n_jobs=-1)
    # Seed the search as well; otherwise the sampled candidates differ on every run.
    # NOTE(review): cv=50 means 50 folds per candidate while the RandomForest
    # search uses cv=2 -- confirm that this is intended.
    rand_search = RandomizedSearchCV(
        estimator=XG,
        param_distributions=param_grid,
        cv=50,
        n_jobs=-1,
        verbose=2,
        random_state=random_state,
    )
    rand_search.fit(X_train, y_train)
    print("Best Parameters:", rand_search.best_params_)
    print("Best Score:", rand_search.best_score_)
    test_score = rand_search.score(X_test, y_test)
    print("Test Set Score:", test_score)
def hyperparameter_tuning_RandomForest(X, y, test_size=0.2, random_state=42):
    """Hyper Parameter Tuning of Random Forest

    Runs a randomized search over RandomForestRegressor settings on the
    training split and prints the best parameters, the best cross-validation
    score and the search object's score on the held-out test split.

    Args:
        X (DataFrame): Feature DataFrame
        y (DataFrame): Target DataFrame
        test_size (float, optional): Test Size. Defaults to 0.2.
        random_state (int, optional): Random State used for the split, the
            estimator and the parameter sampling. Defaults to 42.

    Returns:
        None: None
    """
    # Split once (the previous version repeated this identical call)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    param_grid = {
        'n_estimators': [300, 500, 800, 1000],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    RF = RandomForestRegressor(random_state=random_state, n_jobs=-1)
    # Seed the search as well; otherwise the sampled candidates differ on every run
    rand_search = RandomizedSearchCV(
        estimator=RF,
        param_distributions=param_grid,
        cv=2,
        n_jobs=-1,
        verbose=2,
        random_state=random_state,
    )
    rand_search.fit(X_train, y_train)
    print("Best Parameters:", rand_search.best_params_)
    print("Best Score:", rand_search.best_score_)
    test_score = rand_search.score(X_test, y_test)
    print("Test Set Score:", test_score)
def plot_test_pred_graph(y_test, y_pred, title):
    """Show a scatter plot of predicted values against the original values

    Args:
        y_test (DataFrame): Original Data (plotted on the y axis)
        y_pred (DataFrame): Predicted Data (plotted on the x axis)
        title (String): Title of the Graph

    Returns:
        None: None
    """
    axis_labels = {"x" : "Predicted Value", "y" : "Original Value"}
    scatter = px.scatter(x=y_pred, y=y_test, labels=axis_labels, trendline="ols")
    scatter.update_layout(title_text=title, title_x=0.5, width=950, height=650)
    scatter.show()
dataset = pd.read_excel("flight_fare.xlsx")
dataset.head()
| Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 24/03/2019 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 |
| 1 | Air India | 1/05/2019 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 |
| 2 | Jet Airways | 9/06/2019 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 stops | No info | 13882 |
| 3 | IndiGo | 12/05/2019 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 stop | No info | 6218 |
| 4 | IndiGo | 01/03/2019 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 stop | No info | 13302 |
dataset.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10683 entries, 0 to 10682 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10683 non-null object 1 Date_of_Journey 10683 non-null object 2 Source 10683 non-null object 3 Destination 10683 non-null object 4 Route 10682 non-null object 5 Dep_Time 10683 non-null object 6 Arrival_Time 10683 non-null object 7 Duration 10683 non-null object 8 Total_Stops 10682 non-null object 9 Additional_Info 10683 non-null object 10 Price 10683 non-null int64 dtypes: int64(1), object(10) memory usage: 918.2+ KB
dataset.describe()
| Price | |
|---|---|
| count | 10683.000000 |
| mean | 9087.064121 |
| std | 4611.359167 |
| min | 1759.000000 |
| 25% | 5277.000000 |
| 50% | 8372.000000 |
| 75% | 12373.000000 |
| max | 79512.000000 |
dataset.shape
(10683, 11)
check_null_values(dataset)
Airline 0 Date_of_Journey 0 Source 0 Destination 0 Route 1 Dep_Time 0 Arrival_Time 0 Duration 0 Total_Stops 1 Additional_Info 0 Price 0 dtype: int64
dataset = remove_null_values(dataset)
dataset
| Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 24/03/2019 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 |
| 1 | Air India | 1/05/2019 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 |
| 2 | Jet Airways | 9/06/2019 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 stops | No info | 13882 |
| 3 | IndiGo | 12/05/2019 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 stop | No info | 6218 |
| 4 | IndiGo | 01/03/2019 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 stop | No info | 13302 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10677 | Air Asia | 9/04/2019 | Kolkata | Banglore | CCU → BLR | 19:55 | 22:25 | 2h 30m | non-stop | No info | 4107 |
| 10678 | Air India | 27/04/2019 | Kolkata | Banglore | CCU → BLR | 20:45 | 23:20 | 2h 35m | non-stop | No info | 4145 |
| 10679 | Jet Airways | 27/04/2019 | Banglore | Delhi | BLR → DEL | 08:20 | 11:20 | 3h | non-stop | No info | 7229 |
| 10680 | Vistara | 01/03/2019 | Banglore | New Delhi | BLR → DEL | 11:30 | 14:10 | 2h 40m | non-stop | No info | 12648 |
| 10681 | Air India | 9/05/2019 | Delhi | Cochin | DEL → GOI → BOM → COK | 10:55 | 19:15 | 8h 20m | 2 stops | No info | 11753 |
10682 rows × 11 columns
check_null_values(dataset)
Airline 0 Date_of_Journey 0 Source 0 Destination 0 Route 0 Dep_Time 0 Arrival_Time 0 Duration 0 Total_Stops 0 Additional_Info 0 Price 0 dtype: int64
check_duplicate(dataset)
220
dataset = remove_duplicate(dataset)
dataset
| Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 24/03/2019 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 |
| 1 | Air India | 1/05/2019 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 |
| 2 | Jet Airways | 9/06/2019 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 stops | No info | 13882 |
| 3 | IndiGo | 12/05/2019 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 stop | No info | 6218 |
| 4 | IndiGo | 01/03/2019 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 stop | No info | 13302 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10457 | Air Asia | 9/04/2019 | Kolkata | Banglore | CCU → BLR | 19:55 | 22:25 | 2h 30m | non-stop | No info | 4107 |
| 10458 | Air India | 27/04/2019 | Kolkata | Banglore | CCU → BLR | 20:45 | 23:20 | 2h 35m | non-stop | No info | 4145 |
| 10459 | Jet Airways | 27/04/2019 | Banglore | Delhi | BLR → DEL | 08:20 | 11:20 | 3h | non-stop | No info | 7229 |
| 10460 | Vistara | 01/03/2019 | Banglore | New Delhi | BLR → DEL | 11:30 | 14:10 | 2h 40m | non-stop | No info | 12648 |
| 10461 | Air India | 9/05/2019 | Delhi | Cochin | DEL → GOI → BOM → COK | 10:55 | 19:15 | 8h 20m | 2 stops | No info | 11753 |
10462 rows × 11 columns
check_duplicate(dataset)
0
eda_dataset = dataset.copy()
eda_dataset
| Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 24/03/2019 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 |
| 1 | Air India | 1/05/2019 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 |
| 2 | Jet Airways | 9/06/2019 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 stops | No info | 13882 |
| 3 | IndiGo | 12/05/2019 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 stop | No info | 6218 |
| 4 | IndiGo | 01/03/2019 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 stop | No info | 13302 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10457 | Air Asia | 9/04/2019 | Kolkata | Banglore | CCU → BLR | 19:55 | 22:25 | 2h 30m | non-stop | No info | 4107 |
| 10458 | Air India | 27/04/2019 | Kolkata | Banglore | CCU → BLR | 20:45 | 23:20 | 2h 35m | non-stop | No info | 4145 |
| 10459 | Jet Airways | 27/04/2019 | Banglore | Delhi | BLR → DEL | 08:20 | 11:20 | 3h | non-stop | No info | 7229 |
| 10460 | Vistara | 01/03/2019 | Banglore | New Delhi | BLR → DEL | 11:30 | 14:10 | 2h 40m | non-stop | No info | 12648 |
| 10461 | Air India | 9/05/2019 | Delhi | Cochin | DEL → GOI → BOM → COK | 10:55 | 19:15 | 8h 20m | 2 stops | No info | 11753 |
10462 rows × 11 columns
# Sum of ticket prices per airline, shown as a bar chart
total_price_by_airlines = eda_dataset.groupby("Airline", as_index=False)["Price"].sum()
fig = px.bar(
    total_price_by_airlines,
    x='Airline',
    y='Price',
    text_auto='.2s',
    labels={"Airline" : "Airline", "Price" : "Price"},
)
fig.update_layout(title_text='Total Price by Airlines', title_x=0.5)
fig.show()
# Box plot of the ticket price for every airline
fig = px.box(
    eda_dataset,
    x='Airline',
    y='Price',
    labels={"Airline" : "Airline", "Price" : "Price"},
)
fig.update_layout(title_text='Price Distribution over airlines', title_x=0.5)
fig.show()
# Mean ticket price per airline, shown as a bar chart
mean_price_by_airlines = eda_dataset.groupby("Airline", as_index=False)["Price"].mean()
fig = px.bar(
    mean_price_by_airlines,
    x='Airline',
    y='Price',
    text_auto='.2s',
    labels={"Airline" : "Airline", "Price" : "Price"},
)
fig.update_layout(title_text='Average Price by Airlines', title_x=0.5)
fig.show()
# Mean ticket price for every (airline, number of stops) pair
mean_price_by_stops = eda_dataset.groupby(["Airline", "Total_Stops"], as_index=False)["Price"].mean()
fig = px.bar(
    mean_price_by_stops,
    x='Total_Stops',
    y='Price',
    color="Airline",
    text_auto='.2s',
    labels={"Airline" : "Airline", "Price" : "Price", "Total_Stops":"Number of Stops"},
)
fig.update_layout(title_text='Average Price of all Airlines by Stops Count', title_x=0.5)
fig.show()
# Mean ticket price per departure city, shown as a pie chart
mean_price_by_source = eda_dataset.groupby("Source", as_index=False)["Price"].mean()
source_pie = go.Pie(
    labels=mean_price_by_source["Source"],
    values=mean_price_by_source["Price"],
    pull=[0, 0, 0.2, 0, 0],  # pulls the third slice out of the pie
    textposition='inside',
    textinfo='label+value+percent',
)
fig = go.Figure(data=[source_pie])
fig.update_layout(title_text='Average Price by Departure Location', title_x=0.5, width=950, height=650)
fig.show()
# Parse the journey date once and derive both calendar features from it
journey_dates = pd.to_datetime(eda_dataset["Date_of_Journey"], format="%d/%m/%Y")
eda_dataset["Journey_day"] = journey_dates.dt.day
eda_dataset["Journey_month"] = journey_dates.dt.month

# NOTE: despite its name, price_by_month holds the mean price per day of the month
price_by_month = eda_dataset.groupby(["Journey_day"]).agg({"Price":"mean"}).reset_index()
fig = px.line(price_by_month, x="Journey_day", y="Price", labels={"Journey_day" : "Journey Day", "Price" : "Price", "Airline": "Airline"})
fig.update_layout(title_text='Average Price according to the day of the month', title_x=0.5)
fig.show()
# Dep_Time is an "HH:MM" string in the rows shown; an explicit format avoids
# per-element format guessing and fails loudly on a malformed value
eda_dataset["Dep_Time"] = pd.to_datetime(eda_dataset["Dep_Time"], format="%H:%M").dt.time

# Mean ticket price per departure time, ordered chronologically
price_by_time = eda_dataset.groupby(["Dep_Time"]).agg({"Price":"mean"}).reset_index()
price_by_time = price_by_time.sort_values(by="Dep_Time")
fig = px.line(price_by_time, x="Dep_Time", y="Price", labels={"Dep_Time" : "Departure Time", "Price" : "Price", "Airline": "Airline"})
fig.update_layout(title_text='Price Distribution Over Time', title_x=0.5)
fig.show()
# Work on a copy so the cleaned dataset stays untouched
featured_dataset = dataset.copy()

# Parse the journey date once and derive both calendar features from it
journey_dates = pd.to_datetime(featured_dataset["Date_of_Journey"], format="%d/%m/%Y")
featured_dataset["Journey_day"] = journey_dates.dt.day
featured_dataset["Journey_month"] = journey_dates.dt.month
featured_dataset
| Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | Journey_day | Journey_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 24/03/2019 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 | 24 | 3 |
| 1 | Air India | 1/05/2019 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 | 1 | 5 |
| 2 | Jet Airways | 9/06/2019 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 stops | No info | 13882 | 9 | 6 |
| 3 | IndiGo | 12/05/2019 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 stop | No info | 6218 | 12 | 5 |
| 4 | IndiGo | 01/03/2019 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 stop | No info | 13302 | 1 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10457 | Air Asia | 9/04/2019 | Kolkata | Banglore | CCU → BLR | 19:55 | 22:25 | 2h 30m | non-stop | No info | 4107 | 9 | 4 |
| 10458 | Air India | 27/04/2019 | Kolkata | Banglore | CCU → BLR | 20:45 | 23:20 | 2h 35m | non-stop | No info | 4145 | 27 | 4 |
| 10459 | Jet Airways | 27/04/2019 | Banglore | Delhi | BLR → DEL | 08:20 | 11:20 | 3h | non-stop | No info | 7229 | 27 | 4 |
| 10460 | Vistara | 01/03/2019 | Banglore | New Delhi | BLR → DEL | 11:30 | 14:10 | 2h 40m | non-stop | No info | 12648 | 1 | 3 |
| 10461 | Air India | 9/05/2019 | Delhi | Cochin | DEL → GOI → BOM → COK | 10:55 | 19:15 | 8h 20m | 2 stops | No info | 11753 | 9 | 5 |
10462 rows × 13 columns
# The raw date string is redundant once Journey_day / Journey_month exist
featured_dataset.drop(columns=["Date_of_Journey"], inplace=True)
featured_dataset
| Airline | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | Journey_day | Journey_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 | 24 | 3 |
| 1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 | 1 | 5 |
| 2 | Jet Airways | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 stops | No info | 13882 | 9 | 6 |
| 3 | IndiGo | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 stop | No info | 6218 | 12 | 5 |
| 4 | IndiGo | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 stop | No info | 13302 | 1 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10457 | Air Asia | Kolkata | Banglore | CCU → BLR | 19:55 | 22:25 | 2h 30m | non-stop | No info | 4107 | 9 | 4 |
| 10458 | Air India | Kolkata | Banglore | CCU → BLR | 20:45 | 23:20 | 2h 35m | non-stop | No info | 4145 | 27 | 4 |
| 10459 | Jet Airways | Banglore | Delhi | BLR → DEL | 08:20 | 11:20 | 3h | non-stop | No info | 7229 | 27 | 4 |
| 10460 | Vistara | Banglore | New Delhi | BLR → DEL | 11:30 | 14:10 | 2h 40m | non-stop | No info | 12648 | 1 | 3 |
| 10461 | Air India | Delhi | Cochin | DEL → GOI → BOM → COK | 10:55 | 19:15 | 8h 20m | 2 stops | No info | 11753 | 9 | 5 |
10462 rows × 12 columns
# Parse the "HH:MM" departure time once, with an explicit format, then split
# it into numeric hour and minute features
departure_clock = pd.to_datetime(featured_dataset["Dep_Time"], format="%H:%M")
featured_dataset["Dep_hour"] = departure_clock.dt.hour
featured_dataset["Dep_min"] = departure_clock.dt.minute
featured_dataset.drop(["Dep_Time"], axis = 1, inplace = True)
featured_dataset
| Airline | Source | Destination | Route | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | Journey_day | Journey_month | Dep_hour | Dep_min | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | Banglore | New Delhi | BLR → DEL | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 | 24 | 3 | 22 | 20 |
| 1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 13:15 | 7h 25m | 2 stops | No info | 7662 | 1 | 5 | 5 | 50 |
| 2 | Jet Airways | Delhi | Cochin | DEL → LKO → BOM → COK | 04:25 10 Jun | 19h | 2 stops | No info | 13882 | 9 | 6 | 9 | 25 |
| 3 | IndiGo | Kolkata | Banglore | CCU → NAG → BLR | 23:30 | 5h 25m | 1 stop | No info | 6218 | 12 | 5 | 18 | 5 |
| 4 | IndiGo | Banglore | New Delhi | BLR → NAG → DEL | 21:35 | 4h 45m | 1 stop | No info | 13302 | 1 | 3 | 16 | 50 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10457 | Air Asia | Kolkata | Banglore | CCU → BLR | 22:25 | 2h 30m | non-stop | No info | 4107 | 9 | 4 | 19 | 55 |
| 10458 | Air India | Kolkata | Banglore | CCU → BLR | 23:20 | 2h 35m | non-stop | No info | 4145 | 27 | 4 | 20 | 45 |
| 10459 | Jet Airways | Banglore | Delhi | BLR → DEL | 11:20 | 3h | non-stop | No info | 7229 | 27 | 4 | 8 | 20 |
| 10460 | Vistara | Banglore | New Delhi | BLR → DEL | 14:10 | 2h 40m | non-stop | No info | 12648 | 1 | 3 | 11 | 30 |
| 10461 | Air India | Delhi | Cochin | DEL → GOI → BOM → COK | 19:15 | 8h 20m | 2 stops | No info | 11753 | 9 | 5 | 10 | 55 |
10462 rows × 13 columns
# Arrival_Time is either "HH:MM" or "HH:MM DD Mon" (e.g. "01:10 22 Mar").
# Only the clock part is needed, so keep the first token and parse it with an
# explicit format; format-less parsing of the mixed column raises on pandas >= 2.0.
arrival_clock = pd.to_datetime(
    featured_dataset["Arrival_Time"].str.split().str[0],
    format="%H:%M",
)
featured_dataset["Arrival_hour"] = arrival_clock.dt.hour
featured_dataset["Arrival_min"] = arrival_clock.dt.minute
featured_dataset.drop(["Arrival_Time"], axis = 1, inplace = True)
featured_dataset
| Airline | Source | Destination | Route | Duration | Total_Stops | Additional_Info | Price | Journey_day | Journey_month | Dep_hour | Dep_min | Arrival_hour | Arrival_min | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | Banglore | New Delhi | BLR → DEL | 2h 50m | non-stop | No info | 3897 | 24 | 3 | 22 | 20 | 1 | 10 |
| 1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 7h 25m | 2 stops | No info | 7662 | 1 | 5 | 5 | 50 | 13 | 15 |
| 2 | Jet Airways | Delhi | Cochin | DEL → LKO → BOM → COK | 19h | 2 stops | No info | 13882 | 9 | 6 | 9 | 25 | 4 | 25 |
| 3 | IndiGo | Kolkata | Banglore | CCU → NAG → BLR | 5h 25m | 1 stop | No info | 6218 | 12 | 5 | 18 | 5 | 23 | 30 |
| 4 | IndiGo | Banglore | New Delhi | BLR → NAG → DEL | 4h 45m | 1 stop | No info | 13302 | 1 | 3 | 16 | 50 | 21 | 35 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10457 | Air Asia | Kolkata | Banglore | CCU → BLR | 2h 30m | non-stop | No info | 4107 | 9 | 4 | 19 | 55 | 22 | 25 |
| 10458 | Air India | Kolkata | Banglore | CCU → BLR | 2h 35m | non-stop | No info | 4145 | 27 | 4 | 20 | 45 | 23 | 20 |
| 10459 | Jet Airways | Banglore | Delhi | BLR → DEL | 3h | non-stop | No info | 7229 | 27 | 4 | 8 | 20 | 11 | 20 |
| 10460 | Vistara | Banglore | New Delhi | BLR → DEL | 2h 40m | non-stop | No info | 12648 | 1 | 3 | 11 | 30 | 14 | 10 |
| 10461 | Air India | Delhi | Cochin | DEL → GOI → BOM → COK | 8h 20m | 2 stops | No info | 11753 | 9 | 5 | 10 | 55 | 19 | 15 |
10462 rows × 14 columns
featured_dataset[["Duration"]]
| Duration | |
|---|---|
| 0 | 2h 50m |
| 1 | 7h 25m |
| 2 | 19h |
| 3 | 5h 25m |
| 4 | 4h 45m |
| ... | ... |
| 10457 | 2h 30m |
| 10458 | 2h 35m |
| 10459 | 3h |
| 10460 | 2h 40m |
| 10461 | 8h 20m |
10462 rows × 1 columns
def _normalize_duration(text):
    """Return *text* in the two-token "<H>h <M>m" form, padding the missing part with 0."""
    if len(text.split()) == 2:
        return text
    if "h" in text:
        # Hours only, e.g. "19h" -> "19h 0m"
        return text.strip() + " 0m"
    # Minutes only -> prefix zero hours
    return "0h " + text


duration = [_normalize_duration(raw) for raw in featured_dataset["Duration"]]

# Hours are the text before "h"; minutes are the last token before "m"
duration_hours = [int(entry.split(sep = "h")[0]) for entry in duration]
duration_mins = [int(entry.split(sep = "m")[0].split()[-1]) for entry in duration]

featured_dataset["Duration_hours"] = duration_hours
featured_dataset["Duration_mins"] = duration_mins
featured_dataset
| Airline | Source | Destination | Route | Duration | Total_Stops | Additional_Info | Price | Journey_day | Journey_month | Dep_hour | Dep_min | Arrival_hour | Arrival_min | Duration_hours | Duration_mins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | Banglore | New Delhi | BLR → DEL | 2h 50m | non-stop | No info | 3897 | 24 | 3 | 22 | 20 | 1 | 10 | 2 | 50 |
| 1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 7h 25m | 2 stops | No info | 7662 | 1 | 5 | 5 | 50 | 13 | 15 | 7 | 25 |
| 2 | Jet Airways | Delhi | Cochin | DEL → LKO → BOM → COK | 19h | 2 stops | No info | 13882 | 9 | 6 | 9 | 25 | 4 | 25 | 19 | 0 |
| 3 | IndiGo | Kolkata | Banglore | CCU → NAG → BLR | 5h 25m | 1 stop | No info | 6218 | 12 | 5 | 18 | 5 | 23 | 30 | 5 | 25 |
| 4 | IndiGo | Banglore | New Delhi | BLR → NAG → DEL | 4h 45m | 1 stop | No info | 13302 | 1 | 3 | 16 | 50 | 21 | 35 | 4 | 45 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10457 | Air Asia | Kolkata | Banglore | CCU → BLR | 2h 30m | non-stop | No info | 4107 | 9 | 4 | 19 | 55 | 22 | 25 | 2 | 30 |
| 10458 | Air India | Kolkata | Banglore | CCU → BLR | 2h 35m | non-stop | No info | 4145 | 27 | 4 | 20 | 45 | 23 | 20 | 2 | 35 |
| 10459 | Jet Airways | Banglore | Delhi | BLR → DEL | 3h | non-stop | No info | 7229 | 27 | 4 | 8 | 20 | 11 | 20 | 3 | 0 |
| 10460 | Vistara | Banglore | New Delhi | BLR → DEL | 2h 40m | non-stop | No info | 12648 | 1 | 3 | 11 | 30 | 14 | 10 | 2 | 40 |
| 10461 | Air India | Delhi | Cochin | DEL → GOI → BOM → COK | 8h 20m | 2 stops | No info | 11753 | 9 | 5 | 10 | 55 | 19 | 15 | 8 | 20 |
10462 rows × 16 columns
# The raw duration string is redundant once Duration_hours / Duration_mins exist
featured_dataset.drop(columns=["Duration"], inplace=True)
featured_dataset
| Airline | Source | Destination | Route | Total_Stops | Additional_Info | Price | Journey_day | Journey_month | Dep_hour | Dep_min | Arrival_hour | Arrival_min | Duration_hours | Duration_mins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | Banglore | New Delhi | BLR → DEL | non-stop | No info | 3897 | 24 | 3 | 22 | 20 | 1 | 10 | 2 | 50 |
| 1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 2 stops | No info | 7662 | 1 | 5 | 5 | 50 | 13 | 15 | 7 | 25 |
| 2 | Jet Airways | Delhi | Cochin | DEL → LKO → BOM → COK | 2 stops | No info | 13882 | 9 | 6 | 9 | 25 | 4 | 25 | 19 | 0 |
| 3 | IndiGo | Kolkata | Banglore | CCU → NAG → BLR | 1 stop | No info | 6218 | 12 | 5 | 18 | 5 | 23 | 30 | 5 | 25 |
| 4 | IndiGo | Banglore | New Delhi | BLR → NAG → DEL | 1 stop | No info | 13302 | 1 | 3 | 16 | 50 | 21 | 35 | 4 | 45 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10457 | Air Asia | Kolkata | Banglore | CCU → BLR | non-stop | No info | 4107 | 9 | 4 | 19 | 55 | 22 | 25 | 2 | 30 |
| 10458 | Air India | Kolkata | Banglore | CCU → BLR | non-stop | No info | 4145 | 27 | 4 | 20 | 45 | 23 | 20 | 2 | 35 |
| 10459 | Jet Airways | Banglore | Delhi | BLR → DEL | non-stop | No info | 7229 | 27 | 4 | 8 | 20 | 11 | 20 | 3 | 0 |
| 10460 | Vistara | Banglore | New Delhi | BLR → DEL | non-stop | No info | 12648 | 1 | 3 | 11 | 30 | 14 | 10 | 2 | 40 |
| 10461 | Air India | Delhi | Cochin | DEL → GOI → BOM → COK | 2 stops | No info | 11753 | 9 | 5 | 10 | 55 | 19 | 15 | 8 | 20 |
10462 rows × 15 columns
check_unique_values(featured_dataset)
Airline - 12 Source - 5 Destination - 6 Route - 128 Total_Stops - 5 Additional_Info - 10 Price - 1870 Journey_day - 10 Journey_month - 4 Dep_hour - 24 Dep_min - 12 Arrival_hour - 24 Arrival_min - 12 Duration_hours - 44 Duration_mins - 12
featured_dataset["Airline"].value_counts()
Jet Airways 3700 IndiGo 2043 Air India 1694 Multiple carriers 1196 SpiceJet 815 Vistara 478 Air Asia 319 GoAir 194 Multiple carriers Premium economy 13 Jet Airways Business 6 Vistara Premium economy 3 Trujet 1 Name: Airline, dtype: int64
# One-hot encode the airline; the first category is dropped as the baseline
Airline = pd.get_dummies(featured_dataset[["Airline"]], drop_first=True)
Airline
| Airline_Air India | Airline_GoAir | Airline_IndiGo | Airline_Jet Airways | Airline_Jet Airways Business | Airline_Multiple carriers | Airline_Multiple carriers Premium economy | Airline_SpiceJet | Airline_Trujet | Airline_Vistara | Airline_Vistara Premium economy | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10457 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 10458 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 10459 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 10460 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 10461 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10462 rows × 11 columns
featured_dataset["Source"].value_counts()
Delhi 4345 Kolkata 2860 Banglore 2179 Mumbai 697 Chennai 381 Name: Source, dtype: int64
# One-hot encode the departure city; the first category is dropped as the baseline
Source = pd.get_dummies(featured_dataset[["Source"]], drop_first=True)
Source
| Source_Chennai | Source_Delhi | Source_Kolkata | Source_Mumbai | |
|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 1 | 0 |
| 2 | 0 | 1 | 0 | 0 |
| 3 | 0 | 0 | 1 | 0 |
| 4 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... |
| 10457 | 0 | 0 | 1 | 0 |
| 10458 | 0 | 0 | 1 | 0 |
| 10459 | 0 | 0 | 0 | 0 |
| 10460 | 0 | 0 | 0 | 0 |
| 10461 | 0 | 1 | 0 | 0 |
10462 rows × 4 columns
featured_dataset["Destination"].value_counts()
Cochin 4345 Banglore 2860 Delhi 1265 New Delhi 914 Hyderabad 697 Kolkata 381 Name: Destination, dtype: int64
# One-hot encode the arrival city; the first category is dropped as the baseline
Destination = pd.get_dummies(featured_dataset[["Destination"]], drop_first=True)
Destination
| Destination_Cochin | Destination_Delhi | Destination_Hyderabad | Destination_Kolkata | Destination_New Delhi | |
|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | 0 | 0 | 0 | 0 | 0 |
| 2 | 1 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... |
| 10457 | 0 | 0 | 0 | 0 | 0 |
| 10458 | 0 | 0 | 0 | 0 | 0 |
| 10459 | 0 | 1 | 0 | 0 | 0 |
| 10460 | 0 | 0 | 0 | 0 | 1 |
| 10461 | 1 | 0 | 0 | 0 | 0 |
10462 rows × 5 columns
featured_dataset[["Route", "Additional_Info"]]
| Route | Additional_Info | |
|---|---|---|
| 0 | BLR → DEL | No info |
| 1 | CCU → IXR → BBI → BLR | No info |
| 2 | DEL → LKO → BOM → COK | No info |
| 3 | CCU → NAG → BLR | No info |
| 4 | BLR → NAG → DEL | No info |
| ... | ... | ... |
| 10457 | CCU → BLR | No info |
| 10458 | CCU → BLR | No info |
| 10459 | BLR → DEL | No info |
| 10460 | BLR → DEL | No info |
| 10461 | DEL → GOI → BOM → COK | No info |
10462 rows × 2 columns
# Route and Additional_Info are not used as model features
featured_dataset.drop(columns=["Route", "Additional_Info"], inplace=True)
featured_dataset
| Airline | Source | Destination | Total_Stops | Price | Journey_day | Journey_month | Dep_hour | Dep_min | Arrival_hour | Arrival_min | Duration_hours | Duration_mins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | Banglore | New Delhi | non-stop | 3897 | 24 | 3 | 22 | 20 | 1 | 10 | 2 | 50 |
| 1 | Air India | Kolkata | Banglore | 2 stops | 7662 | 1 | 5 | 5 | 50 | 13 | 15 | 7 | 25 |
| 2 | Jet Airways | Delhi | Cochin | 2 stops | 13882 | 9 | 6 | 9 | 25 | 4 | 25 | 19 | 0 |
| 3 | IndiGo | Kolkata | Banglore | 1 stop | 6218 | 12 | 5 | 18 | 5 | 23 | 30 | 5 | 25 |
| 4 | IndiGo | Banglore | New Delhi | 1 stop | 13302 | 1 | 3 | 16 | 50 | 21 | 35 | 4 | 45 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10457 | Air Asia | Kolkata | Banglore | non-stop | 4107 | 9 | 4 | 19 | 55 | 22 | 25 | 2 | 30 |
| 10458 | Air India | Kolkata | Banglore | non-stop | 4145 | 27 | 4 | 20 | 45 | 23 | 20 | 2 | 35 |
| 10459 | Jet Airways | Banglore | Delhi | non-stop | 7229 | 27 | 4 | 8 | 20 | 11 | 20 | 3 | 0 |
| 10460 | Vistara | Banglore | New Delhi | non-stop | 12648 | 1 | 3 | 11 | 30 | 14 | 10 | 2 | 40 |
| 10461 | Air India | Delhi | Cochin | 2 stops | 11753 | 9 | 5 | 10 | 55 | 19 | 15 | 8 | 20 |
10462 rows × 13 columns
# Count how many rows fall into each Total_Stops category.
featured_dataset["Total_Stops"].value_counts()
1 stop 5625 non-stop 3475 2 stops 1318 3 stops 43 4 stops 1 Name: Total_Stops, dtype: int64
# Encode the stop labels as ordinal integers (more stops -> larger value).
# The replacement is applied to the Total_Stops column only: a frame-wide
# DataFrame.replace would also rewrite any identical string that happened to
# appear in another column.
stops_encoding = {
    "non-stop": 0,
    "1 stop": 1,
    "2 stops": 2,
    "3 stops": 3,
    "4 stops": 4,
}
featured_dataset["Total_Stops"] = featured_dataset["Total_Stops"].replace(stops_encoding)
featured_dataset
| Airline | Source | Destination | Total_Stops | Price | Journey_day | Journey_month | Dep_hour | Dep_min | Arrival_hour | Arrival_min | Duration_hours | Duration_mins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | Banglore | New Delhi | 0 | 3897 | 24 | 3 | 22 | 20 | 1 | 10 | 2 | 50 |
| 1 | Air India | Kolkata | Banglore | 2 | 7662 | 1 | 5 | 5 | 50 | 13 | 15 | 7 | 25 |
| 2 | Jet Airways | Delhi | Cochin | 2 | 13882 | 9 | 6 | 9 | 25 | 4 | 25 | 19 | 0 |
| 3 | IndiGo | Kolkata | Banglore | 1 | 6218 | 12 | 5 | 18 | 5 | 23 | 30 | 5 | 25 |
| 4 | IndiGo | Banglore | New Delhi | 1 | 13302 | 1 | 3 | 16 | 50 | 21 | 35 | 4 | 45 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10457 | Air Asia | Kolkata | Banglore | 0 | 4107 | 9 | 4 | 19 | 55 | 22 | 25 | 2 | 30 |
| 10458 | Air India | Kolkata | Banglore | 0 | 4145 | 27 | 4 | 20 | 45 | 23 | 20 | 2 | 35 |
| 10459 | Jet Airways | Banglore | Delhi | 0 | 7229 | 27 | 4 | 8 | 20 | 11 | 20 | 3 | 0 |
| 10460 | Vistara | Banglore | New Delhi | 0 | 12648 | 1 | 3 | 11 | 30 | 14 | 10 | 2 | 40 |
| 10461 | Air India | Delhi | Cochin | 2 | 11753 | 9 | 5 | 10 | 55 | 19 | 15 | 8 | 20 |
10462 rows × 13 columns
# Swap the raw categorical columns for their one-hot encoded counterparts
# (the Airline, Source and Destination dummy frames built earlier).
raw_categoricals = ["Airline", "Source", "Destination"]
featured_dataset.drop(columns=raw_categoricals, inplace=True)

one_hot_frames = [Airline, Source, Destination]
featured_dataset = pd.concat([featured_dataset, *one_hot_frames], axis=1)
featured_dataset
| Total_Stops | Price | Journey_day | Journey_month | Dep_hour | Dep_min | Arrival_hour | Arrival_min | Duration_hours | Duration_mins | ... | Airline_Vistara Premium economy | Source_Chennai | Source_Delhi | Source_Kolkata | Source_Mumbai | Destination_Cochin | Destination_Delhi | Destination_Hyderabad | Destination_Kolkata | Destination_New Delhi | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3897 | 24 | 3 | 22 | 20 | 1 | 10 | 2 | 50 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | 2 | 7662 | 1 | 5 | 5 | 50 | 13 | 15 | 7 | 25 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 2 | 13882 | 9 | 6 | 9 | 25 | 4 | 25 | 19 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 3 | 1 | 6218 | 12 | 5 | 18 | 5 | 23 | 30 | 5 | 25 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 1 | 13302 | 1 | 3 | 16 | 50 | 21 | 35 | 4 | 45 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10457 | 0 | 4107 | 9 | 4 | 19 | 55 | 22 | 25 | 2 | 30 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 10458 | 0 | 4145 | 27 | 4 | 20 | 45 | 23 | 20 | 2 | 35 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 10459 | 0 | 7229 | 27 | 4 | 8 | 20 | 11 | 20 | 3 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 10460 | 0 | 12648 | 1 | 3 | 11 | 30 | 14 | 10 | 2 | 40 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 10461 | 2 | 11753 | 9 | 5 | 10 | 55 | 19 | 15 | 8 | 20 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
10462 rows × 30 columns
# Show (rows, columns) of the frame after the dummy columns were concatenated.
featured_dataset.shape
(10462, 30)
# List the final column names (numeric features plus one-hot columns).
featured_dataset.columns
Index(['Total_Stops', 'Price', 'Journey_day', 'Journey_month', 'Dep_hour',
'Dep_min', 'Arrival_hour', 'Arrival_min', 'Duration_hours',
'Duration_mins', 'Airline_Air India', 'Airline_GoAir', 'Airline_IndiGo',
'Airline_Jet Airways', 'Airline_Jet Airways Business',
'Airline_Multiple carriers',
'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',
'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy',
'Source_Chennai', 'Source_Delhi', 'Source_Kolkata', 'Source_Mumbai',
'Destination_Cochin', 'Destination_Delhi', 'Destination_Hyderabad',
'Destination_Kolkata', 'Destination_New Delhi'],
dtype='object')
# Separate the target (Price) from the predictors: y holds the price column,
# X holds every other column.
y = featured_dataset["Price"]
X = featured_dataset.drop(columns=["Price"])
X
| Total_Stops | Journey_day | Journey_month | Dep_hour | Dep_min | Arrival_hour | Arrival_min | Duration_hours | Duration_mins | Airline_Air India | ... | Airline_Vistara Premium economy | Source_Chennai | Source_Delhi | Source_Kolkata | Source_Mumbai | Destination_Cochin | Destination_Delhi | Destination_Hyderabad | Destination_Kolkata | Destination_New Delhi | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 24 | 3 | 22 | 20 | 1 | 10 | 2 | 50 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | 2 | 1 | 5 | 5 | 50 | 13 | 15 | 7 | 25 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 2 | 9 | 6 | 9 | 25 | 4 | 25 | 19 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 3 | 1 | 12 | 5 | 18 | 5 | 23 | 30 | 5 | 25 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 1 | 1 | 3 | 16 | 50 | 21 | 35 | 4 | 45 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10457 | 0 | 9 | 4 | 19 | 55 | 22 | 25 | 2 | 30 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 10458 | 0 | 27 | 4 | 20 | 45 | 23 | 20 | 2 | 35 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 10459 | 0 | 27 | 4 | 8 | 20 | 11 | 20 | 3 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 10460 | 0 | 1 | 3 | 11 | 30 | 14 | 10 | 2 | 40 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 10461 | 2 | 9 | 5 | 10 | 55 | 19 | 15 | 8 | 20 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
10462 rows × 29 columns
# Display the target series (Price).
y
0 3897
1 7662
2 13882
3 6218
4 13302
...
10457 4107
10458 4145
10459 7229
10460 12648
10461 11753
Name: Price, Length: 10462, dtype: int64
# Pairwise correlation of every column (Price included), rendered as a heatmap.
corr_train = featured_dataset.corr()
fig = px.imshow(
    corr_train,
    color_continuous_scale="RdBu_r",
    aspect="auto",
    text_auto=False,
)
fig.update_layout(
    title={"text": "Correlation Matrix", "x": 0.5},
    width=950,
    height=800,
)
fig.show()
# Fit an extra-trees ensemble on the full data to obtain feature importances.
# The seed is fixed so the importance ranking is reproducible between runs;
# 42 matches the random_state used for the train/test split in this notebook.
selection = ExtraTreesRegressor(random_state=42)
selection.fit(X, y)
ExtraTreesRegressor()
# Raw importance scores, one per column of X (same order as X.columns).
print(selection.feature_importances_)
[2.16785309e-01 1.43151497e-01 5.47809340e-02 2.47212457e-02 2.11006997e-02 2.84860606e-02 1.90430300e-02 1.07227283e-01 1.66962020e-02 1.44775995e-02 1.74697325e-03 1.28527384e-02 1.46490655e-01 6.90802457e-02 2.07519835e-02 9.82103215e-04 2.63160215e-03 8.70467831e-05 5.29589423e-03 7.53573949e-05 4.91313307e-04 1.38024257e-02 3.18404603e-03 7.90476378e-03 1.75936169e-02 1.89867091e-02 5.43225382e-03 5.41847111e-04 2.55985634e-02]
# Pair every feature name with its importance score and order the table from
# most to least important.
feat_importances = pd.DataFrame(
    {"Features": X.columns, "Value": selection.feature_importances_}
)
feat_importances = feat_importances.sort_values(by="Value", ascending=False)

# Horizontal bar chart of the 20 highest-scoring features.
fig = px.bar(
    feat_importances.head(20),
    x="Value",
    y="Features",
    orientation="h",
    width=950,
    height=800,
    labels={"Features": "Features", "Value": "Importance Value"},
)
fig.update_layout(title={"text": "Top 20 Feature Importance", "x": 0.5})
fig.show()
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
Here we create a function that applies six different regression models.
| Models |
|---|
| Linear Regression |
| Decision Tree Regression |
| Random Forest Regression |
| KNN Regression |
| AdaBoost Regression |
| XGBoost Regression |
# Evaluate every baseline model and print its train/test metrics.
base_evaluation_model = base_models(X, y, test_size=0.2, random_state=42)

# (printed label, key in the per-model results dict), in report order.
metric_rows = (
    ("MSE Train:", "MSE Train"),
    ("MAE Train:", "MAE Train"),
    ("R2 Score of Train:", "R2 train"),
    ("MSE Test:", "MSE Test"),
    ("MAE Test:", "MAE Test"),
    ("R2 Score of Test:", "R2 test"),
)

for model_name, results in base_evaluation_model.items():
    print(f"Model: {model_name}")
    for label, key in metric_rows:
        print(label, results[key])
    print("\n")
Model: Linear Regression MSE Train: 7913320.308500698 MAE Train: 1951.2237426775362 R2 Score of Train: 0.6322527159838631 MSE Test: 8678835.045869935 MAE Test: 1998.8996862538004 R2 Score of Test: 0.5837544362210152 Model: Decision Tree Regression MSE Train: 649937.8988329949 MAE Train: 321.0855538296093 R2 Score of Train: 0.9697961300999993 MSE Test: 6087372.769154457 MAE Test: 1390.4996416626852 R2 Score of Test: 0.7080435453793611 Model: Random Forest Regression MSE Train: 1003051.193261262 MAE Train: 577.4648843246144 R2 Score of Train: 0.9533862730597767 MSE Test: 3934788.876140612 MAE Test: 1183.5603525584145 R2 Score of Test: 0.8112836105947379 Model: KNN Regression MSE Train: 5617587.158441869 MAE Train: 1463.0492292986023 R2 Score of Train: 0.738939871039754 MSE Test: 9525326.000191115 MAE Test: 1900.8498805542283 R2 Score of Test: 0.5431558878383145 Model: AdaBoost Regression MSE Train: 12896355.918013288 MAE Train: 3026.168945155245 R2 Score of Train: 0.400681423515715 MSE Test: 14186365.097454576 MAE Test: 3086.5008450610626 R2 Score of Test: 0.3196078152476748 Model: XGBoost Regression MSE Train: 1406985.5897153183 MAE Train: 826.6941142739155 R2 Score of Train: 0.9346146612172604 MSE Test: 3357933.3573675444 MAE Test: 1139.9830465740527 R2 Score of Test: 0.8389501752156404
| Models | R2 Score(Train) | R2 Score(Test) |
|---|---|---|
| Linear Regression | 0.63 | 0.58 |
| Decision Tree Regression | 0.96 | 0.71 |
| Random Forest Regression | 0.95 | 0.81 |
| KNN Regression | 0.73 | 0.54 |
| AdaBoost Regression | 0.40 | 0.32 |
| XGBoost Regression | 0.93 | 0.83 |
# Run the random-forest hyper-parameter search helper with the same
# test_size / random_state settings used for the manual split above.
hyperparameter_tuning_RandomForest(X, y, test_size=0.2, random_state=42)
Fitting 2 folds for each of 10 candidates, totalling 20 fits
Best Parameters: {'n_estimators': 1000, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 30}
Best Score: 0.7498123941153614
Test Set Score: 0.8262424081261696
# Final random-forest model.
# NOTE(review): n_estimators and max_depth differ from the best parameters
# printed by the search cell (1000 / 30); presumably adjusted by hand - confirm.
# The seed is fixed so the reported test metrics are reproducible; 42 matches
# the random_state used for the train/test split.
hyper_tuned_random_forest = RandomForestRegressor(
    n_estimators=800,
    min_samples_split=5,
    min_samples_leaf=1,
    max_depth=None,
    random_state=42,
)
hyper_tuned_random_forest.fit(X_train, y_train)
RandomForestRegressor(min_samples_split=5, n_estimators=800)
# Score the tuned random forest on the held-out test split and plot
# predictions against the true prices.
y_pred = hyper_tuned_random_forest.predict(X_test)
plot_test_pred_graph(y_test, y_pred, "Hypertuned Random Forest Regressor")
print("R2 Score - ", r2_score(y_test, y_pred))
print("MAE - ", mean_absolute_error(y_test, y_pred))
# This value is the mean squared error, so it is labelled MSE
# (it was previously printed under a second "MAE" label).
print("MSE - ", mean_squared_error(y_test, y_pred))
R2 Score - 0.8261847091032793 MAE - 1144.8168250263202 MAE - 3624096.85390309
# Run the XGBoost hyper-parameter search helper with the same
# test_size / random_state settings used for the manual split above.
hyperparameter_tuning_XGBoost(X, y, test_size=0.2, random_state=42)
Fitting 50 folds for each of 10 candidates, totalling 500 fits
Best Parameters: {'subsample': 1.0, 'reg_lambda': 0.5, 'reg_alpha': 0.1, 'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 1.0}
Best Score: 0.8417882784568909
Test Set Score: 0.8382307637716018
# Final XGBoost model, configured from an explicit parameter table and then
# fitted on the training split.
xgb_params = {
    "n_estimators": 800,
    "learning_rate": 0.05,
    "max_depth": 5,
    "gamma": 1,
    "subsample": 1.0,
    "colsample_bytree": 1.0,
    "reg_alpha": 0.5,
    "reg_lambda": 0.5,
}
hyper_tuned_xgb = XGBRegressor(**xgb_params)
hyper_tuned_xgb.fit(X_train, y_train)
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1.0,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, gamma=1, gpu_id=-1, grow_policy='depthwise',
importance_type=None, interaction_constraints='',
learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()', n_estimators=800, n_jobs=0,
num_parallel_tree=1, predictor='auto', random_state=0,
reg_alpha=0.5, reg_lambda=0.5, ...)
# Score the tuned XGBoost model on the held-out test split and plot
# predictions against the true prices.
y_pred = hyper_tuned_xgb.predict(X_test)
plot_test_pred_graph(y_test, y_pred, "Hypertuned XGBoost Regressor")
print("R2 Score - ", r2_score(y_test, y_pred))
print("MAE - ", mean_absolute_error(y_test, y_pred))
# This value is the mean squared error, so it is labelled MSE
# (it was previously printed under a second "MAE" label).
print("MSE - ", mean_squared_error(y_test, y_pred))
R2 Score - 0.8435359209747264 MAE - 1137.789876574258 MAE - 3262319.290891782
| Models | R2 Score(Test) |
|---|---|
| Random Forest Regression | 0.82 |
| XGBoost Regression | 0.84 |
In the world of contemporary air travel, estimating airline fares in advance is enormously important. It improves financial readiness, flexibility, and the overall travel experience by enabling travelers to anticipate and prepare for fare variations. Airlines stand to gain from greater customer satisfaction, better revenue management, and the opportunity to create innovative marketing campaigns. The link between precise price forecasts and proactive decision-making highlights the mutually beneficial relationship between passengers looking for affordable options and airlines aiming to maximize revenue. Approximating flight fares in advance emerges as a crucial step towards a more open, customer-focused, and economically efficient travel environment as technology and data-driven insights continue to transform the aviation business. In future work, the same data could be used to study inter-airline price competition and to provide real-time pricing insights.